% Tomov, Nath, Ltaief, Dongarra (IPDPSW 2010): dense linear algebra solvers
% for multicore systems with GPU accelerators.
@inproceedings{similar,
  author    = {Tomov, S. and Nath, R. and Ltaief, H. and Dongarra, J.},
  title     = {Dense Linear Algebra Solvers for Multicore with {GPU} Accelerators},
  booktitle = {2010 {IEEE} International Symposium on Parallel \& Distributed Processing, Workshops and {PhD} Forum ({IPDPSW})},
  year      = {2010},
  pages     = {1--8},
  keywords  = {coprocessors;linear algebra;mathematics computing;matrix decomposition;multiprocessing systems;optimisation;parallel programming;Cholesky factorization;GPU accelerators;LAPACK software;LU factorization;MAGMA library;QR factorization;algorithm-specific optimization;architecture-specific optimization;dense linear algebra solvers;graphics processing unit;hybridization techniques;multicore systems;optimized BLAS software;parallel programming model;Acceleration;Computer architecture;Equations;Iterative algorithms;Linear accelerators;Linear algebra;Linear systems;Multicore processing;Numerical simulation;Scientific computing;Dense Linear Algebra Solvers;GPU Accelerators;Hybrid Algorithms;MAGMA;Multicore},
  doi       = {10.1109/IPDPSW.2010.5470941},
}

% Lee et al. (2010): CPU vs. GPU throughput-computing evaluation.
% Acronyms in the title are braced so sentence-casing styles keep them upper-case.
@article{debunking,
  author     = {Lee, Victor W. and Kim, Changkyu and Chhugani, Jatin and Deisher, Michael and Kim, Daehyun and Nguyen, Anthony D. and Satish, Nadathur and Smelyanskiy, Mikhail and Chennupaty, Srinivas and Hammarlund, Per and Singhal, Ronak and Dubey, Pradeep},
  title      = {Debunking the {100X} {GPU} vs. {CPU} Myth: An Evaluation of Throughput Computing on {CPU} and {GPU}},
  journal    = {{ACM} {SIGARCH} Computer Architecture News},
  issue_date = {June 2010},
  volume     = {38},
  number     = {3},
  month      = jun,
  year       = {2010},
  issn       = {0163-5964},
  pages      = {451--460},
  numpages   = {10},
  url        = {http://doi.acm.org/10.1145/1816038.1816021},
  doi        = {10.1145/1816038.1816021},
  acmid      = {1816021},
  publisher  = {ACM},
  address    = {New York, NY, USA},
  keywords   = {cpu architecture, gpu architecture, performance analysis, performance measurement, software optimization, throughput computing},
}
% Lawson, Hanson, Kincaid, Krogh (1979): the original BLAS paper.
% Journal name is spelled out to match the blas-updated entry.
@article{blas,
  author     = {Lawson, C. L. and Hanson, R. J. and Kincaid, D. R. and Krogh, F. T.},
  title      = {Basic Linear Algebra Subprograms for {Fortran} Usage},
  journal    = {ACM Transactions on Mathematical Software},
  issue_date = {Sept. 1979},
  volume     = {5},
  number     = {3},
  month      = sep,
  year       = {1979},
  issn       = {0098-3500},
  pages      = {308--323},
  numpages   = {16},
  url        = {http://doi.acm.org/10.1145/355841.355847},
  doi        = {10.1145/355841.355847},
  acmid      = {355847},
  publisher  = {ACM},
  address    = {New York, NY, USA},
}
% Blackford et al.: updated BLAS specification.
% Volume 28 of this journal corresponds to 2002 (volume 5 is 1979 in the
% blas entry), so the year is 2002; issue number, month and DOI added.
@article{blas-updated,
  author  = {Blackford, L. S. and Demmel, J. and Dongarra, J. and Duff, I. and Hammarling, S. and Henry, G. and Heroux, M. and Kaufman, L. and Lumsdaine, A. and Petitet, A. and Pozo, R. and Remington, K. and Whaley, R. C.},
  title   = {An Updated Set of Basic Linear Algebra Subprograms ({BLAS})},
  journal = {ACM Transactions on Mathematical Software},
  volume  = {28},
  number  = {2},
  month   = jun,
  year    = {2002},
  pages   = {135--151},
  doi     = {10.1145/567806.567807},
}
% NVIDIA CUDA C Programming Guide, version 5.0 (vendor manual, URL only).
% Corporate author is double-braced so it is not split into first/last name.
@manual{cuda,
  author = {{NVIDIA Corporation}},
  title  = {{NVIDIA} {CUDA} {C} Programming Guide v5.0},
  month  = oct,
  year   = {2012},
  url    = {http://docs.nvidia.com/cuda/pdf/CUDA_C_Programming_Guide.pdf},
}
% NVIDIA CUBLAS Library user guide, version 5.0 (vendor manual, URL only).
% Corporate author is double-braced so it is not split into first/last name.
@manual{cublas,
  author = {{NVIDIA Corporation}},
  title  = {{NVIDIA} {CUBLAS} Library v5.0},
  month  = oct,
  year   = {2012},
  url    = {http://docs.nvidia.com/cuda/pdf/CUDA_CUBLAS_Users_Guide.pdf},
}
% Kestur, Davis, Williams (ISVLSI 2010): BLAS comparison across FPGA, CPU and GPU.
% Entry originates from a DBLP/BibSonomy export; the provenance fields
% (added-at, biburl, interhash, intrahash, timestamp, ee) are ignored by styles.
% NOTE(review): the crossref parent conf/isvlsi/2010 is not defined in the
% visible part of this file -- confirm it exists (and, for classic BibTeX,
% appears after this entry) or drop the crossref field.
@inproceedings{ms-gaxpy,
  author    = {Kestur, Srinidhi and Davis, John D. and Williams, Oliver},
  title     = {{BLAS} Comparison on {FPGA}, {CPU} and {GPU}},
  booktitle = {{IEEE} Computer Society Annual Symposium on {VLSI} ({ISVLSI})},
  crossref  = {conf/isvlsi/2010},
  year      = {2010},
  pages     = {288--293},
  publisher = {IEEE Computer Society},
  doi       = {10.1109/ISVLSI.2010.84},
  ee        = {http://dx.doi.org/10.1109/ISVLSI.2010.84},
  url       = {http://dblp.uni-trier.de/db/conf/isvlsi/isvlsi2010.html#KesturDW10},
  keywords  = {dblp},
  added-at  = {2011-03-29T00:00:00.000+0200},
  timestamp = {2011-03-29T00:00:00.000+0200},
  biburl    = {http://www.bibsonomy.org/bibtex/2f0c6d8d7b2bb083a1553ce44bb2c89f7/dblp},
  interhash = {c33cb13679bf8add999c2d4aa64175cc},
  intrahash = {f0c6d8d7b2bb083a1553ce44bb2c89f7},
}
% Golub and Van Loan, Matrix Computations, third edition (1996).
% The edition lives in its own field so the style formats it; it is no longer
% hard-coded into the title.
@book{matmul,
  author    = {Golub, Gene H. and Van Loan, Charles F.},
  title     = {Matrix Computations},
  edition   = {Third},
  year      = {1996},
  isbn      = {0-8018-5414-8},
  publisher = {Johns Hopkins University Press},
  address   = {Baltimore, MD, USA},
}
% Fan, Qiu, Kaufman, Yoakum-Stover (SC '04): GPU cluster for HPC.
% Only a start page is known, so pages holds the single page rather than an
% open-ended range.
@inproceedings{gpgpu,
  author    = {Fan, Zhe and Qiu, Feng and Kaufman, Arie and Yoakum-Stover, Suzanne},
  title     = {{GPU} Cluster for High Performance Computing},
  booktitle = {Proceedings of the 2004 {ACM/IEEE} Conference on Supercomputing},
  series    = {SC '04},
  year      = {2004},
  isbn      = {0-7695-2153-3},
  pages     = {47},
  url       = {http://dx.doi.org/10.1109/SC.2004.26},
  doi       = {10.1109/SC.2004.26},
  acmid     = {1049991},
  publisher = {IEEE Computer Society},
  address   = {Washington, DC, USA},
  keywords  = {GPU cluster, data intensive computing, lattice Boltzmann model, urban airborne dispersion, computational fluid dynamics},
}
% Web resource at gpgpu.org. The site name is used as a double-braced
% corporate author; a raw URL in the author field would be parsed as a
% personal name.
% NOTE(review): month/year presumably record when the site was consulted --
% confirm, and consider a urldate field if the bibliography uses biblatex.
@online{gpuorg,
  author = {{GPGPU.org}},
  title  = {General-Purpose Computation on Graphics Hardware},
  month  = apr,
  year   = {2013},
  url    = {http://gpgpu.org/},
}
